In [ ]:
# <h1>Table of Contents<span class="tocSkip"></span></h1>
# <div class="toc"><ul class="toc-item"><li><span><a href="#Import-dep" data-toc-modified-id="Import-dep-1"><span class="toc-item-num">1 </span>Import dep</a></span></li><li><span><a href="#Example-4.1" data-toc-modified-id="Example-4.1-2"><span class="toc-item-num">2 </span>Example 4.1</a></span><ul class="toc-item"><li><span><a href="#Ex-4.1" data-toc-modified-id="Ex-4.1-2.1"><span class="toc-item-num">2.1 </span>Ex 4.1</a></span></li><li><span><a href="#Ex-4.2" data-toc-modified-id="Ex-4.2-2.2"><span class="toc-item-num">2.2 </span>Ex 4.2</a></span></li><li><span><a href="#Ex-4.4" data-toc-modified-id="Ex-4.4-2.3"><span class="toc-item-num">2.3 </span>Ex 4.4</a></span></li></ul></li><li><span><a href="#Ex-4.5" data-toc-modified-id="Ex-4.5-3"><span class="toc-item-num">3 </span>Ex 4.5</a></span></li><li><span><a href="#Ex-4.7" data-toc-modified-id="Ex-4.7-4"><span class="toc-item-num">4 </span>Ex 4.7</a></span><ul class="toc-item"><li><span><a href="#Replicate-Fig.-4.2" data-toc-modified-id="Replicate-Fig.-4.2-4.1"><span class="toc-item-num">4.1 </span>Replicate Fig. 4.2</a></span></li><li><span><a href="#Add-more-details" data-toc-modified-id="Add-more-details-4.2"><span class="toc-item-num">4.2 </span>Add more details</a></span></li></ul></li></ul></div>
In [1]:
# Widen the notebook's main container so wide outputs (tables, plots) fit.
from IPython.display import display, HTML
# Set the notebook width to 80% of the browser window.
display(HTML("<style>.container { width: 80% !important; }</style>"))
In [2]:
!jupyter notebook list
In [3]:
# Discover the URL of the Jupyter server this notebook runs on and list the
# sessions (open notebooks) it serves via the REST API.
# NOTE: paste the server URL without a trailing `/`, e.g. `http://localhost:3710`.
port = 3710
import IPython
import json
import requests
hostname = !hostname
# Get the current Jupyter server's info (one line per running server).
result = !jupyter notebook list
for res in result:
    if f'http://localhost:{port}/' in res:
        # Keep only the URL (with token), dropping the notebook dir after ' :: '.
        result = res.split(' :: ')[0]
        break
# NOTE(review): if no server matches `port`, `result` remains the raw list and
# the requests.get below will fail -- TODO confirm intended behavior.
# Print the server URL
print(f'Current Jupyter server {hostname} URL: {result}')
# Get the list of running notebooks
response = requests.get(f'{result}api/sessions')
# # Convert the JSON data to a string and print it
# print(json.dumps(response.json(), indent=4))
nbs = response.json()
nb_names = [nb['name'] for nb in nbs]
print(len(nb_names), nb_names)
1. Import dep¶
In [4]:
from itertools import product
import numpy as np
import matplotlib.pyplot as plt
%matplotlib notebook
In [5]:
import plotly
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import plotly.io as pio
pio.renderers.default = "notebook"
In [6]:
COLOR_LIST = plotly.colors.DEFAULT_PLOTLY_COLORS
len(COLOR_LIST)
Out[6]:
In [7]:
print(plotly.__version__, plotly.__path__)
2. Example 4.1¶
In [8]:
def state_val_update1(vnn, ga):
    """One in-place sweep of iterative policy evaluation for the gridworld
    of Example 4.1 (equiprobable random policy, reward -1 per step).

    ``vnn`` is mutated in place, so states later in the sweep already see
    the freshly updated values of earlier states (the "in-place" DP variant).
    Off-grid moves leave the state unchanged, which is implemented here by
    clamping each neighbor index back onto the grid.
    """
    n_rows, n_cols = vnn.shape
    terminals = {(0, 0), (n_rows - 1, n_cols - 1)}
    # Action order matches the original backup: left, right, up, down.
    moves = ((0, -1), (0, 1), (-1, 0), (1, 0))
    for r in range(n_rows):
        for c in range(n_cols):
            if (r, c) in terminals:
                # Terminal states keep value 0 by definition.
                vnn[r, c] = 0
                continue
            acc = 0.0
            for dr, dc in moves:
                # Clamp off-grid neighbors back to (r, c) itself.
                rr = min(max(r + dr, 0), n_rows - 1)
                cc = min(max(c + dc, 0), n_cols - 1)
                acc += ga * vnn[rr, cc]
            vnn[r, c] = -1 + acc / 4
def eval_schema2(
    state_val_update_func,
    v0: np.ndarray, ga: float, max_iter: int, memo_step: int = 1, plt_trace=False, trace_ij=(0, 0)
):
    """Run `max_iter` sweeps of iterative policy evaluation.

    `state_val_update_func(v, ga)` mutates `v` in place, so newly calculated
    values are reused within a sweep (the RLAI book calls this the
    "in-place" algorithm).

    Returns a dict mapping sweep index -> value-function snapshot taken
    every `memo_step` sweeps (pre-sweep values), plus the final values at
    key `max_iter`. Optionally plots the value trace at `trace_ij`.
    """
    memo_v = {}
    current = v0.copy()
    for sweep in range(max_iter):
        if sweep % memo_step == 0:
            # Snapshot the value function as it stands *before* this sweep.
            memo_v[sweep] = current
        updated = current.copy()
        state_val_update_func(updated, ga)
        current = updated
    memo_v[max_iter] = current
    if plt_trace:
        sweep_ids = np.array(sorted(memo_v))
        trace_vals = [memo_v[s][trace_ij] for s in sweep_ids]
        fig = make_subplots(rows=1, cols=1)
        # Shift x by 1 so the log-scaled axis starts at 1 instead of 0.
        fig.add_trace(go.Scatter(x=sweep_ids + 1, y=trace_vals, mode='lines+markers'), row=1, col=1)
        fig.update_layout(
            xaxis=dict(title='Iteration', type='log'),
            title=f'Value Function at Iteration {trace_ij}', height=600, width=600)
        fig.show()
    return memo_v
def pr_val_func(val: np.ndarray, fmt='%8.3f', reshape=False, nr=None, nc=None):
    """Pretty-print a value function with a fixed float format.

    When `reshape` is True, the (flat) array is first reshaped to (nr, nc)
    in row-major (C) order before printing.
    """
    to_show = val.reshape(nr, nc, order='C') if reshape else val
    print(np.array2string(to_show, formatter={'float_kind': lambda x: fmt % x}))
2.1. Ex 4.1¶
In [ ]:
# Ex 4.1: evaluate the equiprobable policy on the 4x4 gridworld with
# gamma = 1 and trace the value of state (1, 3) across sweeps.
v0 = np.zeros((4, 4))
ga = 1.0
max_iter = 1000
memo_step = 10
memo_v = eval_schema2(state_val_update1, v0, ga, max_iter, memo_step, plt_trace=True, trace_ij=(1, 3))
pr_val_func(memo_v[max_iter])
In [ ]:
# q(11, down) backs up the terminal state's value; q(7, down) backs up
# state 11's value. Reward -1 per step, gamma = 1.
q_11_down = -1+memo_v[max_iter][3, 3]
q_7_down = -1+memo_v[max_iter][2, 3]
q_11_down, q_7_down
Out[ ]:
2.2. Ex 4.2¶
In [ ]:
def state_val_update2(vnn, ga):
    """One in-place sweep for Exercise 4.2: the 4x4 gridworld stored as a
    flat 17-vector, where index 16 is the extra state 15 whose actions
    left/up/right/down lead to states 12, 13, 14, and back to itself.

    The first 16 entries follow the same dynamics as Example 4.1 (the
    original states' transitions are unchanged by adding state 15).
    """
    n_rows, n_cols = 4, 4

    def flat(r, c):
        # Row-major index into the flat value vector.
        return r * n_cols + c

    for r in range(n_rows):
        for c in range(n_cols):
            if (r, c) == (0, 0) or (r, c) == (n_rows - 1, n_cols - 1):
                # Terminal states keep value 0 by definition.
                vnn[flat(r, c)] = 0
                continue
            acc = 0.0
            # Action order matches the original backup: left, right, up, down;
            # off-grid neighbors are clamped back to (r, c) itself.
            for dr, dc in ((0, -1), (0, 1), (-1, 0), (1, 0)):
                rr = min(max(r + dr, 0), n_rows - 1)
                cc = min(max(c + dc, 0), n_cols - 1)
                acc += ga * vnn[flat(rr, cc)]
            vnn[flat(r, c)] = -1 + acc / 4
    # Extra state 15 (index 16): transitions to 12, 13, 14, and itself.
    acc = ga * vnn[flat(3, 0)] + ga * vnn[flat(3, 1)] + ga * vnn[flat(3, 2)] + ga * vnn[-1]
    vnn[-1] = -1 + acc / 4
In [ ]:
v0 = np.zeros((17,))
ga = 1.0
max_iter = 1000
memo_step = 10
memo_v = eval_schema2(state_val_update2, v0, ga, max_iter, memo_step, plt_trace=True, trace_ij=-1)
pr_val_func(memo_v[max_iter][:16], reshape=True, nr=-1, nc=4)
memo_v[max_iter][-1]
Out[ ]:
2.3. Ex 4.4¶
- The bug is that the optimal policy may not be unique. Even though policy improvement reports policy-stable = false, the algorithm may not produce a strictly better state-value function in the next policy evaluation; it can cycle forever among different optimal policies. (In other words, ties should be broken in a consistent order.)
- One way to fix it: in policy improvement, instead of always assigning the argmax action to $\pi(s)$, check whether some action gives a strictly better value than the existing $q(s, \pi(s))$. Only then assign that action to $\pi(s)$ and set policy-stable = false; otherwise leave $\pi(s)$ unchanged.
- Another way to fix it: within policy evaluation, also check whether the current state-value function is close to the state-value function from the previous policy evaluation. If there is no strict improvement, terminate — the policy is already optimal up to ties.
3. Ex 4.5¶
\begin{align*} q_{\pi}(s, a) &= \mathbb{E}[R_{t+1}+\gamma\, v_{\pi}(S_{t+1}) \mid s, a] \\ &= \mathbb{E}[R_{t+1}+\gamma \sum_{a'}\pi(a'|S_{t+1})\, q_{\pi}(S_{t+1}, a') \mid s, a] \\ &= \sum_{s', r}p(s',r|s,a)\Big(r+\gamma \sum_{a'}\pi(a'|s')\, q_{\pi}(s', a')\Big) \end{align*}
At each iteration $k$ we have $q_k(s,a)$; applying the update above to it yields $q_{k+1}(s,a)$. This is iterative policy evaluation for the action-value function.
4. Ex 4.7¶
4.1. Replicate Fig. 4.2¶
- The solution matches Fig. 4.2 in the book.
In [25]:
from itertools import product
import math
import time
from joblib import Parallel, delayed, parallel_backend
# --- Jack's Car Rental (Example 4.2 / Exercise 4.7) problem constants ---
N_JOBS = 20          # joblib worker processes for the DP sweeps
MV_COST = 2          # cost per car moved overnight
RENT_RWD = 10        # revenue per car rented
ARR1, ARR2 = 3, 4    # Poisson means of rental requests at locations 1 and 2
RET1, RET2 = 3, 2    # Poisson means of returns at locations 1 and 2
MAX_CARS = 20        # capacity cap per location
MAX_CAR_MOVE = 5     # at most 5 cars can be moved overnight
# Precomputed Poisson pmfs P(X=k) for k = 0..MAX_CARS-1 and their cumulative
# sums; the cumulative arrays let the code lump the distribution's tail mass
# onto the truncation point.
POISSON_CACHE_ARR1 = [np.exp(-ARR1)*ARR1**arr1/math.factorial(arr1) for arr1 in range(MAX_CARS)]
POISSON_CACHE_CUMU_ARR1 = np.cumsum(POISSON_CACHE_ARR1)
POISSON_CACHE_ARR2 = [np.exp(-ARR2)*ARR2**arr2/math.factorial(arr2) for arr2 in range(MAX_CARS)]
POISSON_CACHE_CUMU_ARR2 = np.cumsum(POISSON_CACHE_ARR2)
POISSON_CACHE_RET1 = [np.exp(-RET1)*RET1**ret1/math.factorial(ret1) for ret1 in range(MAX_CARS)]
POISSON_CACHE_CUMU_RET1 = np.cumsum(POISSON_CACHE_RET1)
POISSON_CACHE_RET2 = [np.exp(-RET2)*RET2**ret2/math.factorial(ret2) for ret2 in range(MAX_CARS)]
POISSON_CACHE_CUMU_RET2 = np.cumsum(POISSON_CACHE_RET2)
# def car_rental_trans(c1, c2, mv, c1n, c2n):
# """
# Calculate the transition probability and expected reward for the car rental problem from
# state (c1, c2) to state (c1n, c2n) by moving mv cars from location 1 to location 2.
# c1 is the number of cars at location 1.
# c2 is the number of cars at location 2.
# mv is the number of cars moved from location 1 to location 2.
# c1n is the number of cars at location 1 at the end of the next day.
# c2n is the number of cars at location 2 at the end of the next day.
# """
# if ((mv>0 and c1<mv) or (mv<0 and c2<-mv)):
# return 0.0, 0.0
# c1_be = min(MAX_CARS, c1-mv)
# c2_be = min(MAX_CARS, c2+mv)
# # c1n = min(MAX_CARS, c1_be-min(c1_be, arr1)+ret1)
# r_sa_sp, p_sa_sp = 0, 0
# for arr1, arr2 in product(range(c1_be+1), range(c2_be+1)):
# ret1 = c1n-(c1_be-arr1)
# ret2 = c2n-(c2_be-arr2)
# if ret1<0 or ret2<0:
# continue
# p_arr1 = POISSON_CACHE_ARR1[arr1] if arr1<c1_be else 1-POISSON_CACHE_CUMU_ARR1[c1_be-1]
# p_arr2 = POISSON_CACHE_ARR2[arr2] if arr2<c2_be else 1-POISSON_CACHE_CUMU_ARR2[c2_be-1]
# p_ret1 = POISSON_CACHE_RET1[ret1] if c1n<MAX_CARS else 1-POISSON_CACHE_CUMU_RET1[ret1-1]
# p_ret2 = POISSON_CACHE_RET2[ret2] if c2n<MAX_CARS else 1-POISSON_CACHE_CUMU_RET2[ret2-1]
# prob = p_arr1*p_arr2*p_ret1*p_ret2
# r_sa_sp += RENT_RWD*(arr1+arr2)*prob
# p_sa_sp += prob
# r_sa_sp -= MV_COST*abs(mv)*p_sa_sp
# return r_sa_sp, p_sa_sp
def car_rental_one_loc_trans(c_be, c_ed, loc):
    """
    Calculate the transition probability and expected reward for the car rental problem from
    state c_be to state c_ed, for a single location.
    c_be is the number of cars at the beginning of the day.
    c_ed is the number of cars at the end of the day.
    loc is the location of the cars (1 or 2), selecting the Poisson caches.

    Returns (r_sa_sp, p_sa_sp): the rental revenue weighted by event
    probabilities, and the total probability of the c_be -> c_ed transition,
    marginalized over feasible rental-request counts.
    """
    r_sa_sp, p_sa_sp = 0, 0
    # Feasible request counts: at least c_be - c_ed rentals are needed to end
    # at c_ed (arr >= c_be - c_ed), and at most c_be cars can be rented.
    arr_arr = np.arange(max(0, c_be-c_ed), c_be+1)
    pois_cache_arr = POISSON_CACHE_ARR1 if loc==1 else POISSON_CACHE_ARR2
    pois_cache_cum_arr = POISSON_CACHE_CUMU_ARR1 if loc==1 else POISSON_CACHE_CUMU_ARR2
    pois_cache_ret = POISSON_CACHE_RET1 if loc==1 else POISSON_CACHE_RET2
    pois_cache_cum_ret = POISSON_CACHE_CUMU_RET1 if loc==1 else POISSON_CACHE_CUMU_RET2
    # Requests are truncated at c_be: the tail mass P(arr >= c_be) is lumped
    # onto arr == c_be. There can be a subtle bug here when c_be=0, hence the
    # explicit probability-1 fallback (empty lot is left with probability 1).
    p_arr = np.array([
        pois_cache_arr[arr] if arr<c_be else ((1-pois_cache_cum_arr[c_be-1]) if c_be>0 else 1)
        for arr in arr_arr
    ])
    # Returns are truncated at MAX_CARS: when the day ends at capacity, the
    # tail mass P(returns >= ret) is used instead of the pmf. There can be a
    # subtle bug here when ret=0, hence the explicit probability-1 fallback.
    def _ret_prob(arr):
        # Returns needed so that (c_be - arr) remaining + ret returns = c_ed.
        ret = c_ed-(c_be-arr)
        return pois_cache_ret[ret] if c_ed<MAX_CARS else ((1-pois_cache_cum_ret[ret-1]) if ret>0 else 1)
    p_ret = np.array([_ret_prob(arr) for arr in arr_arr])
    # Revenue: RENT_RWD per satisfied request, weighted by joint probability.
    r_sa_sp = RENT_RWD*np.sum(arr_arr*p_arr*p_ret)
    p_sa_sp = np.sum(p_arr*p_ret)
    return r_sa_sp, p_sa_sp
def car_rental_trans1(c1, c2, mv, c1n, c2n):
    """
    Calculate the transition probability and expected reward for the car rental problem from
    state (c1, c2) to state (c1n, c2n) by moving mv cars from location 1 to location 2.
    c1 is the number of cars at location 1.
    c2 is the number of cars at location 2.
    mv is the number of cars moved from location 1 to location 2.
    c1n is the number of cars at location 1 at the end of the next day.
    c2n is the number of cars at location 2 at the end of the next day.
    """
    # Moving more cars than the source location holds is infeasible.
    infeasible = (mv > 0 and c1 < mv) or (mv < 0 and c2 < -mv)
    if infeasible:
        return 0.0, 0.0
    # Car counts at each location at the start of the day, capped at MAX_CARS.
    start1 = min(MAX_CARS, c1 - mv)
    start2 = min(MAX_CARS, c2 + mv)
    rev1, prob1 = car_rental_one_loc_trans(start1, c1n, loc=1)
    rev2, prob2 = car_rental_one_loc_trans(start2, c2n, loc=2)
    joint_prob = prob1 * prob2
    # Be careful with the probability of the cost: each location's revenue is
    # weighted by the other location's transition probability, and the moving
    # cost applies with the joint probability.
    reward = rev1 * prob2 + rev2 * prob1 - MV_COST * abs(mv) * joint_prob
    return reward, joint_prob
def car_rental_pol_eval(
    car_rental_trans_func: callable,
    arr_pol: np.ndarray, arr_val: np.ndarray, ga: float, pol_iter: int,
    tol: float = 1e-4, max_iter: int = 1000, memo_step: int = 1, plt_trace=False, trace_ij=(0, 0)
):
    """Iterative (full-sweep) policy evaluation for Jack's Car Rental.

    Repeatedly performs one-step Bellman backups of every state under the
    fixed policy `arr_pol`, until the max absolute change falls below
    tol*RENT_RWD or `max_iter` sweeps have run.

    BUGFIX: `tol` was declared as `tol: 1e-4`, i.e. an *annotation* whose
    value is the literal 1e-4, not a default value -- callers were silently
    required to pass `tol`. It is now a real default (`tol: float = 1e-4`),
    and `max_iter` gets a default too so no non-default follows a default.

    car_rental_trans_func(i, j, mv, ni, nj) must return (expected reward,
    transition probability). `pol_iter` is the enclosing policy-iteration
    index (kept for signature compatibility; informational only).
    Returns the evaluated value function.
    """
    nr, nc = arr_pol.shape
    arr_valn = arr_val.copy()
    # Trace of the value at `trace_ij`, keyed by sweep index.
    memo = {0: arr_valn[trace_ij]}
    m_diff, avg_diff, tic = 0, 0, time.time()
    # Convergence threshold scaled by the reward unit.
    abs_tol = tol*RENT_RWD
    def _pol_eval_one_sweep(arr_val):
        # One full Bellman backup of every state, parallelized over states.
        def _one_update(i, j):
            arr_r_sa_sp, arr_p_sa_sp = np.zeros_like(arr_val), np.zeros_like(arr_val)
            for ni, nj in product(range(nr), range(nc)):
                r_sa_sp, p_sa_sp = car_rental_trans_func(i, j, arr_pol[i, j], ni, nj)
                arr_r_sa_sp[ni, nj], arr_p_sa_sp[ni, nj] = r_sa_sp, p_sa_sp
            return np.sum(arr_r_sa_sp+ga*arr_p_sa_sp*arr_val)
        with parallel_backend('loky', n_jobs=N_JOBS):
            res = Parallel(verbose=0, pre_dispatch="1.5*n_jobs")(
                delayed(_one_update)(i, j) for i, j in product(range(nr), range(nc))
            )
        return np.array(res).reshape(nr, nc, order='C')
    for si in range(max_iter):
        if si%10==0:
            # Report the diffs of the previous sweep, then reset the timer.
            print(f'Policy evaluation iteration {si}: {m_diff:.2f}:{avg_diff:.2f} ({time.time()-tic:.2f}s)')
            m_diff, avg_diff, tic = 0, 0, time.time()
        arr_valn = _pol_eval_one_sweep(arr_val)
        m_diff = np.max(np.abs(arr_valn-arr_val))
        avg_diff = np.mean(np.abs(arr_valn-arr_val))
        arr_val = arr_valn
        if si%memo_step==0:
            memo[si] = arr_valn[trace_ij]
        if m_diff<abs_tol:
            print(f'Converged at iteration {si}')
            break
    if plt_trace:
        fig = make_subplots(rows=1, cols=1)
        arr_si = np.array(sorted(memo.keys()))
        vs = [memo[si] for si in arr_si]
        # Shift x by 1 so the log-scaled axis starts at 1 instead of 0.
        fig.add_trace(go.Scatter(x=arr_si+1, y=vs, mode='lines+markers'), row=1, col=1)
        fig.update_layout(
            xaxis=dict(title='Iteration', type='log'),
            title=f'Value Function at Iteration {trace_ij}', height=600, width=600
        )
        fig.show()
    return arr_valn
def eval_act_val(car_rental_trans_func, arr_val, i, j, mv, ga):
    """Expected action value q(s, a) for state (i, j) and action mv.

    Sums r(s, a, s') + ga * p(s'|s, a) * V(s') over all next states s',
    where `car_rental_trans_func` supplies (reward, probability) pairs.
    (An unused `trans_prob` accumulator was removed.)
    """
    nr, nc = arr_val.shape
    q_sa = 0
    for ni, nj in product(range(nr), range(nc)):
        r_sa_sp, p_sa_sp = car_rental_trans_func(i, j, mv, ni, nj)
        q_sa += r_sa_sp + ga*p_sa_sp*arr_val[ni, nj]
    return q_sa
# def car_rental_pol_impr(arr_pol: np.ndarray, arr_val: np.ndarray, ga: float):
# nr, nc = arr_pol.shape
# arr_poln = arr_pol.copy()
# pol_stab = True
# for i, j in product(range(nr), range(nc)):
# q_sa_memo = {}
# a_max, q_max = arr_pol[i, j], -np.inf
# for mv in range(-MAX_CAR_MOVE, MAX_CAR_MOVE+1):
# q_sa_memo[mv] = eval_act_val(arr_val, i, j, mv, ga)
# if q_sa_memo[mv]>q_max:
# q_max = q_sa_memo[mv]
# a_max = mv
# if q_max>q_sa_memo[arr_pol[i, j]]:
# arr_poln[i, j] = a_max
# pol_stab = False
# return pol_stab, arr_poln
def car_rental_pol_impr(car_rental_trans_func: callable, arr_pol: np.ndarray, arr_val: np.ndarray, ga: float):
    """
    One step of greedy policy improvement, parallelized over states.

    For each state, the action only changes if the best action value is
    strictly greater than the state's current value arr_val[i, j];
    otherwise the existing action is kept. NOTE(review): comparing against
    v(s) rather than q(s, old action) appears to be the Ex 4.4-style fix
    that avoids cycling among tying optimal policies -- confirm intent.

    Returns (pol_stab, arr_poln): whether the policy is unchanged, and the
    improved policy.
    """
    nr, nc = arr_pol.shape
    arr_poln = arr_pol.copy()
    def _one_update(i, j):
        # Scan all candidate moves and track the best action value.
        q_sa_memo = {}
        a_max, q_max = arr_pol[i, j], -np.inf
        for mv in range(-MAX_CAR_MOVE, MAX_CAR_MOVE+1):
            q_sa_memo[mv] = eval_act_val(car_rental_trans_func, arr_val, i, j, mv, ga)
            if q_sa_memo[mv]>q_max:
                a_max, q_max = mv, q_sa_memo[mv]
        if q_max>arr_val[i, j]:
            return a_max
        else:
            return arr_pol[i, j]
    with parallel_backend('loky', n_jobs=N_JOBS):
        res = Parallel(verbose=0, pre_dispatch="1.5*n_jobs")(
            delayed(_one_update)(i, j) for i, j in product(range(nr), range(nc))
        )
    arr_poln = np.array(res).reshape(nr, nc, order='C')
    pol_stab = np.all(arr_poln==arr_pol)
    return pol_stab, arr_poln
def _plot_heatmap(z, title):
    """Shared 21x21 heatmap rendering for policies and value functions.

    (plot_arr_pol and plot_arr_val were byte-for-byte duplicates except for
    the title; the common body now lives here.)
    """
    fig = go.Figure(data=go.Heatmap(
        z=z,
        colorscale='aggrnyl',
        x=[i for i in range(MAX_CARS+1)],
        y=[j for j in range(MAX_CARS+1)],
        hoverongaps=False))
    fig.update_layout(
        title=title,
        xaxis=dict(title='Number of Cars at Location 2', showgrid=True, gridwidth=1, gridcolor='black'),
        yaxis=dict(title='Number of Cars at Location 1', showgrid=True, gridwidth=1, gridcolor='black'),
        autosize=False,
        width=500,
        height=500,
    )
    fig.show()

def plot_arr_pol(arr_pol):
    """Heatmap of the policy (cars moved from location 1 to 2) per state."""
    _plot_heatmap(arr_pol, 'Policy Heatmap')

def plot_arr_val(arr_val):
    """Heatmap of the state-value function per state."""
    _plot_heatmap(arr_val, 'Heatmap of arr_val')
def car_rental_pol_iter(car_rental_trans_func, ga, max_pol_iter=-1, plt_iter=False, **pol_eval_kwargs):
    """
    Full policy iteration for Jack's Car Rental: alternate policy evaluation
    and greedy policy improvement until the policy is stable, or until
    max_pol_iter iterations when max_pol_iter > 0.

    pol_eval_kwargs = {'tol': 1e-4, 'max_iter': 1000, 'memo_step': 1, 'plt_trace': False, 'trace_ij': (0, 0)}
    """
    # Start from the all-zeros policy (move no cars) and zero values.
    arr_pol = np.zeros((MAX_CARS+1, MAX_CARS+1), dtype=int)
    arr_val = np.zeros((MAX_CARS+1, MAX_CARS+1), dtype=float)
    # Alternative warm-start for the value function (kept for reference):
    # for i, j in product(range(MAX_CARS+1), range(MAX_CARS+1)):
    #     arr_val[i, j] = min(i, ARR1)+min(j, ARR2)
    # arr_val *= RENT_RWD/(1-ga)
    scnt = 0
    while True:
        scnt += 1
        print("@"*10+f'Policy iteration {scnt}'+"@"*10)
        # Evaluate the current policy, warm-started from the previous values.
        arr_valn = car_rental_pol_eval(car_rental_trans_func, arr_pol, arr_val, ga, scnt, **pol_eval_kwargs)
        pol_stab, arr_poln = car_rental_pol_impr(car_rental_trans_func, arr_pol, arr_valn, ga)
        if plt_iter:
            print("="*10+f'Iteration {scnt}'+"="*10)
            # NOTE: this plots the pre-improvement policy next to the newly
            # evaluated values.
            plot_arr_pol(arr_pol)
            plot_arr_val(arr_valn)
        if pol_stab or (max_pol_iter>0 and scnt>=max_pol_iter):
            print(f'Policy at iteration {scnt} (policy stable: {pol_stab})')
            arr_val, arr_pol = arr_valn, arr_poln
            break
        # arr_val still holds the previous iteration's values at this point,
        # so the diff measures the per-iteration value improvement.
        arr_val_diff = arr_valn-arr_val
        print(f'Policy at iteration {scnt} (policy stable: {pol_stab}) with min and max val improvement: {np.min(arr_val_diff):.2f}, {np.max(arr_val_diff):.2f}')
        arr_val, arr_pol = arr_valn, arr_poln
    return arr_pol, arr_val
In [23]:
# Visualize a heuristic warm-start for the value function: expected one-day
# rental revenue per state, scaled by the infinite-horizon factor 1/(1-gamma).
arr_val = np.zeros((MAX_CARS+1, MAX_CARS+1), dtype=float)
for i, j in product(range(MAX_CARS+1), range(MAX_CARS+1)):
    arr_val[i, j] = min(i, ARR1)+min(j, ARR2)
arr_val *= RENT_RWD/(1-0.9)
plot_arr_val(arr_val)
In [24]:
# Replicate Fig. 4.2: policy iteration with gamma = 0.9, capped at 5
# improvement steps, tracing the value of state (10, 10).
pol_eval_kwargs = {'tol': 1e-3, 'max_iter': 1000, 'memo_step': 10, 'plt_trace': True, 'trace_ij': (10, 10)}
_ = car_rental_pol_iter(car_rental_trans1, 0.9, max_pol_iter=5, plt_iter=True, **pol_eval_kwargs)
4.2. Add more details¶
In [27]:
# Exercise 4.7 extras: a second parking lot is needed when a location holds
# more than PARKING_THRE cars overnight, at EXTRA_PARKING_COST per location.
PARKING_THRE = 10
EXTRA_PARKING_COST = 4
def car_rental_trans2(c1, c2, mv, c1n, c2n):
    """
    Calculate the transition probability and expected reward for the car rental problem from
    state (c1, c2) to state (c1n, c2n) by moving mv cars from location 1 to location 2,
    with the Exercise 4.7 modifications: the first car moved from location 1
    to location 2 is free, and each location that ends the day with more than
    PARKING_THRE cars incurs EXTRA_PARKING_COST.
    """
    # Moving more cars than the source location holds is infeasible.
    infeasible = (mv > 0 and c1 < mv) or (mv < 0 and c2 < -mv)
    if infeasible:
        return 0.0, 0.0
    # Car counts at each location at the start of the day, capped at MAX_CARS.
    start1 = min(MAX_CARS, c1 - mv)
    start2 = min(MAX_CARS, c2 + mv)
    rev1, prob1 = car_rental_one_loc_trans(start1, c1n, loc=1)
    rev2, prob2 = car_rental_one_loc_trans(start2, c2n, loc=2)
    joint_prob = prob1 * prob2
    # Be careful with the probability of the cost: one move from location 1
    # to 2 is free when mv > 0 (hence abs(mv - 1) paid moves), and parking
    # fees apply per location exceeding the threshold at day's end.
    cost = MV_COST * abs(mv - int(mv > 0)) + EXTRA_PARKING_COST * (int(c1n > PARKING_THRE) + int(c2n > PARKING_THRE))
    reward = rev1 * prob2 + rev2 * prob1 - cost * joint_prob
    return reward, joint_prob
In [28]:
# Rerun policy iteration with the modified dynamics of Exercise 4.7
# (free first move, extra parking cost).
pol_eval_kwargs = {'tol': 1e-3, 'max_iter': 1000, 'memo_step': 10, 'plt_trace': True, 'trace_ij': (10, 10)}
_ = car_rental_pol_iter(car_rental_trans2, 0.9, max_pol_iter=5, plt_iter=True, **pol_eval_kwargs)
In [ ]:
In [ ]:
In [ ]: